# Load and preview the raw dataset ----
library(readxl)
# read_excel reads both xls and xlsx files
# FIX: use `<-` for assignment (tidyverse style), not `=`.
uniteddata <- read_excel("Take_home_Stats_test_Data.xlsx")
# Interactive preview of the data.
DT::datatable(uniteddata)

Model One

For a one-unit increase in X1, holding X2 constant, the mean of Y increases by 2.149586; for a one-unit increase in X2, holding X1 constant, the mean of Y increases by 3.465170.

# Model One: Gaussian GLM (identity link), i.e. ordinary least squares
# of Y on X1 and X2.  FIX: assignment uses `<-`, not `=`.
model1 <- glm(Y~X1+X2,data=uniteddata)

# Coefficient table: estimates, standard errors, t statistics, p-values.
broom::tidy(model1)
##          term  estimate std.error statistic      p.value
## 1 (Intercept) 16.539317 5.3464912  3.093490 2.118082e-03
## 2          X1  2.149586 0.1278502 16.813317 2.740094e-48
## 3          X2  3.465170 0.3715703  9.325743 7.829028e-19
# Per-observation diagnostics: fitted values, residuals, leverage (.hat),
# Cook's distance, etc.
head(broom::augment(model1))
##          Y       X1       X2  .fitted  .se.fit     .resid        .hat
## 1 220.3576 52.82055 25.69345 219.1138 2.051231  1.2438138 0.006062251
## 2 201.8625 52.50277 21.01730 202.2271 1.609600 -0.3645488 0.003732850
## 3 179.3549 37.34995 21.16060 170.1513 1.756099  9.2035769 0.004443272
## 4 148.6372 32.85682 19.37569 154.3079 1.791368 -5.6707126 0.004623540
## 5 203.9518 49.15237 26.37978 213.6070 2.374467 -9.6551807 0.008123389
## 6 173.1624 30.51970 19.17214 148.5788 1.968793 24.5836232 0.005584767
##     .sigma      .cooksd  .std.resid
## 1 26.37813 4.559422e-06  0.04735635
## 2 26.37820 2.400401e-07 -0.01386341
## 3 26.37413 1.823760e-04  0.35012737
## 4 26.37666 7.207070e-05 -0.21574780
## 5 26.37371 3.696804e-04 -0.36798830
## 6 26.34910 1.639249e-03  0.93575990
# One-row model summary: deviance, log-likelihood, AIC/BIC.
broom::glance(model1)
##   null.deviance df.null    logLik      AIC      BIC deviance df.residual
## 1       1107792     399 -1874.581 3757.161 3773.127 275540.6         397

Exploratory Data Analysis

# Y-vs-X1 scatter with an OLS line and the fitted equation printed on
# the panel via ggpmisc::stat_poly_eq().
Y <- uniteddata$Y
X2 <- uniteddata$X2
#formula <- y ~ poly(x, 3, raw = TRUE)
# BUG FIX: stat_poly_eq() fits its label model on the plot's x/y
# aesthetics.  The previous `Y ~ X2` formula was evaluated against the
# global Y and X2 vectors, so the displayed equation described Y ~ X2
# while the plot and smooth show Y against X1.
formula <- y ~ x
#formula <- y ~ poly(x, raw = TRUE)
fill <- "#4271AE"
line <- "#1F3552"

#p <-
  ggplot(uniteddata, aes(x=X1, y=Y)) + geom_point(shape=1) + geom_smooth(method=lm, se=FALSE) +
      ggtitle(" regression line") +
      scale_x_continuous(name = "X1") +
      scale_y_continuous(name = "Y") +
theme_economist() +
      theme(axis.line.x = element_line(size=.5, colour = "black"),
            axis.line.y = element_line(size=.5, colour = "black"),
            axis.text.x=element_text(colour="black", size = 9),
            axis.text.y=element_text(colour="black", size = 9),
            panel.grid.major = element_line(colour = "#d3d3d3"),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(), panel.background = element_blank(),
            plot.title = element_text(family = "Tahoma"),
            text=element_text(family="Tahoma"))+
  stat_poly_eq(aes(label =  paste(..eq.label.., ..adj.rr.label.., sep = "~~~~")),
                   formula = formula, rr.digits = 3, coef.digits = 2, parse = TRUE)

#ggplotly(p)
# Y-vs-X2 scatter with an OLS line; stat_poly_eq() builds the y-hat
# equation label directly from the x/y aesthetics (formula = y ~ x).
Y <- uniteddata$Y
X2 <- uniteddata$X2
formula <- Y ~ X2
fill <- "#4271AE"
line <- "#1F3552"

ggplot(uniteddata, aes(x=X2, y=Y)) + geom_point(shape=1) + geom_smooth(method=lm, se=FALSE) +
      ggtitle(" regression line") +
      scale_x_continuous(name = "X2") +   # BUG FIX: axis shows X2; was mislabelled "X1"
      scale_y_continuous(name = "Y") +
      theme(axis.line.x = element_line(size=.5, colour = "black"),
            axis.line.y = element_line(size=.5, colour = "black"),
            axis.text.x=element_text(colour="black", size = 9),
            axis.text.y=element_text(colour="black", size = 9),
            panel.grid.major = element_line(colour = "#d3d3d3"),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(), panel.background = element_blank(),
            plot.title = element_text(family = "Tahoma"),
            text=element_text(family="Tahoma"))+
   stat_poly_eq(formula = y ~ x, eq.with.lhs=FALSE,
      aes(label = paste("hat(italic(y))","~`=`~",..eq.label..,"~~~", ..rr.label.., sep = "")),parse = TRUE)

# Interactive version of the Y-vs-X1 scatter: the same layers and
# Economist theme as the static build above, converted with ggplotly().
fill <- "#4271AE"
line <- "#1F3552"

p <- ggplot(uniteddata, aes(x = X1, y = Y)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, se = FALSE) +
  ggtitle(" regression line") +
  scale_x_continuous(name = "X1") +
  scale_y_continuous(name = "Y") +
  theme_economist() +
  theme(text = element_text(family = "Tahoma"),
        plot.title = element_text(family = "Tahoma"),
        panel.background = element_blank(),
        panel.border = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        axis.line.x = element_line(size = .5, colour = "black"),
        axis.line.y = element_line(size = .5, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 9),
        axis.text.y = element_text(colour = "black", size = 9))
ggplotly(p)

Checking Regression Assumptions 1) The mean of the residual errors is close to 0.

# Assumption 1: the residuals should average ~0; 2.16e-14 confirms it.
mean(model1$residuals)
## [1] 2.161661e-14
#summary(model1)
#model1$residuals
#model1$fitted
#dataplot=data_frame(residuals=model1$residuals,fitted=model1$fitted,stdred=model1$std.resid)
# NOTE(review): par(mfrow=...) only affects base graphics; the ggplot-based
# autoplot below ignores it.
par(mfrow = c(1, 2))
library(ggfortify)

# The six standard regression diagnostic plots (ggfortify's autoplot).
# FIX: spell out FALSE; T/F are reassignable shortcuts and unsafe.
ggplot2::autoplot(model1, which = 1:6, ncol = 3, label.size = FALSE)+ theme_bw()

library(ggfortify)
# fortify() demo on the built-in USArrests data: converts it to a plain
# data frame suitable for ggplot.  FIX: `<-` for assignment.
myfortdata <- fortify(USArrests)
myfortdata%>%head()%>%DT::datatable()
#head(myfortdata)%>%head()%>%DT::datatable()

# Residual-vs-fitted data for a hand-built diagnostic plot.
# FIXES: data_frame() is deprecated -- tibble() is the drop-in
# replacement; `$fitted` relied on partial matching of the glm component
# `fitted.values` -- spell it out.
dataplot <- tibble(residuals = model1$residuals, fitted = model1$fitted.values)

# Residuals vs fitted with a loess smooth and a zero reference line.
p1<-ggplot(dataplot, aes(fitted, residuals))+geom_point()+
    geom_smooth(method="loess")+geom_hline(yintercept=0, col="red", linetype="dashed")+
    xlab("Fitted values")+ylab("Residuals")+
    ggtitle("Residual vs Fitted Plot")+theme_bw()+geom_point(shape=1,color="purple")
ggplotly(p1)
#===========================================================================
# 
# broom augment and cbind.data.frame  data add regression diagnostics to data
#===========================================================================

# NOTE(review): par(mfrow) has no effect on the ggplot below.
par(mfrow=c(2,2))
# Bind the broom diagnostics onto the raw data.  NOTE(review): this
# duplicates the Y/X1/X2 columns; the plot only uses .fitted/.resid.
# FIX: `<-` for assignment.
myudata <- cbind.data.frame(uniteddata,broom::augment(model1))
p <- ggplot(data = myudata, aes(x = .fitted, y = .resid)) +
  geom_hline(yintercept = 0, colour = "firebrick3") +geom_smooth(method="loess")+
 geom_point()+xlab("Fitted values")+ylab("Residuals")+ggtitle("Residual vs Fitted Plot")+theme_bw()
ggplotly(p)
#===========================================================================
# 
# fortify data add regression diagnostics to data
#===========================================================================

# fortify() attaches the regression diagnostics (.fitted, .resid,
# .stdresid, .hat, .sigma, .cooksd) directly to the model data.
myundata <- fortify(model1)

# BUG FIX: this plot previously used `myudata` (the augment-based frame
# from the previous section) even though this section builds `myundata`.
# The diagnostic columns carry the same values, so the rendered plot is
# unchanged, but the section is now self-contained.
pp <- ggplot(data = myundata, aes(x = .fitted, y = .resid)) +
  geom_hline(yintercept = 0, colour = "firebrick3") +
  geom_point() +
  geom_smooth(se = FALSE)+theme_bw()+
  xlab("Fitted Values")+ylab("Residuals")+
  ggtitle("Homoscedasticity assumption")
ggplotly(pp)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
myundata%>%head()
##          Y       X1       X2        .hat   .sigma      .cooksd  .fitted
## 1 220.3576 52.82055 25.69345 0.006062251 26.37813 4.559422e-06 219.1138
## 2 201.8625 52.50277 21.01730 0.003732850 26.37820 2.400401e-07 202.2271
## 3 179.3549 37.34995 21.16060 0.004443272 26.37413 1.823760e-04 170.1513
## 4 148.6372 32.85682 19.37569 0.004623540 26.37666 7.207070e-05 154.3079
## 5 203.9518 49.15237 26.37978 0.008123389 26.37371 3.696804e-04 213.6070
## 6 173.1624 30.51970 19.17214 0.005584767 26.34910 1.639249e-03 148.5788
##       .resid   .stdresid
## 1  1.2438138  0.04735635
## 2 -0.3645488 -0.01386341
## 3  9.2035769  0.35012737
## 4 -5.6707126 -0.21574780
## 5 -9.6551807 -0.36798830
## 6 24.5836232  0.93575990
# Q-Q plot of the standardized residuals against the y = x reference
# line, to assess the normality assumption.
# NOTE(review): the title contains a typo ("Theoritcal").
p2=ggplot(data = myundata, aes(sample = .stdresid)) +
  stat_qq() +geom_abline(colour = "firebrick3")+
    ggtitle("Theoritcal Quantiles/Normality assumption")+theme_bw()
ggplotly(p2)
# Standardized residuals vs fitted values with a loess smooth; a flat
# trend supports the equal-variance assumption.
# FIXES: se = TRUE spelled out (T is reassignable); corrected the
# misspelled title and the garbled y-axis label (stray "|" and doubled
# parentheses).
p3 <- ggplot(data = myundata, aes(x = .fitted, y = .stdresid)) +
  geom_hline(yintercept = 0, colour = "firebrick3") +
  geom_point()+ geom_smooth(se = TRUE, method = 'loess')+theme_bw()+
  ggtitle("Standardized Residuals vs Fitted")+
  xlab("Fitted Value")+
  ylab("Standardized residuals")
ggplotly(p3)
# Standardized residuals vs leverage (hat values).
# FIXES: removed the duplicated loess layer (the same smooth was added
# twice, first with se = FALSE and again with se = TRUE); corrected the
# x-axis label (the x aesthetic is leverage, not a fitted value) and the
# stray "|" in the y label.
p4 <- ggplot(data = myundata, aes(x = .hat, y = .stdresid)) +
  geom_point() +
  geom_smooth(se = TRUE, method = 'loess')+theme_bw()+
  ggtitle("Residuals vs. leverages")+
  xlab("Leverage (hat value)")+
  ylab("Standardized residuals")
ggplotly(p4) 
# Points size reflecting Cook's distance
# Residual-vs-fitted plot with point area proportional to Cook's
# distance, so influential observations stand out.
p5=ggplot(data = myundata, aes(x = .fitted, y = .resid, size = .cooksd)) +
  geom_hline(yintercept = 0, colour = "firebrick3") +
  geom_point() +scale_size_area("Cook’s distance")+
  ggtitle("Points size reflecting Cook's distance")+
  theme_bw()+xlab("Fitted Values")+ylab("Residuals")
ggplotly(p5) 
# Residuals vs. leverages with observation number
# Standardized residuals vs leverage, with each point labelled by its row
# number and +/-2 reference bands.
# FIXES: corrected the x-axis label (x is leverage, not a fitted value),
# fixed the "Standidized" typo, and removed the duplicated theme_bw().
p6 <- ggplot(data = myundata, aes(x = .hat, y = .stdresid)) +
  geom_point() +
  geom_smooth(se = FALSE,method = 'loess') +
  geom_text(label = rownames(myundata), size = 4) +
  geom_hline(yintercept = 2, lty = "dotted", colour = "slateblue1") +
  geom_hline(yintercept = -2, lty = "dotted", colour = "slateblue1")+
  theme_bw()+xlab("Leverage (hat value)")+ylab("Standardized Residuals")
ggplotly(p6) 
# Compare a quadratic (red) and a cubic (blue) least-squares fit of Y on
# X1, points underneath, no confidence ribbons.
p7 <- ggplot(myundata, aes(x = X1, y = Y)) +
  geom_point() +
  stat_smooth(
    method = "lm", formula = y ~ x + I(x^2),
    colour = "red", se = FALSE
  ) +
  stat_smooth(
    method = "lm", formula = y ~ x + I(x^2) + I(x^3),
    colour = "blue", se = FALSE
  ) +
  theme_bw()
ggplotly(p7)
#we define our own function mean of log
#and change geom to line
# meanlog(): mean of log(y), used as the stat_summary summary function.
# NOTE(review): `fun.y` is deprecated in ggplot2 >= 3.3 in favour of
# `fun`; kept as-is to match the ggplot2 version this report was knit
# with.  Also note log() yields NaN/-Inf for non-positive Y -- confirm Y > 0.
meanlog <- function(y) {mean(log(y))}
ggplot(myundata, aes(x=X1, y=Y)) + 
  stat_summary(fun.y="meanlog", geom="line")

# Kernel density estimate of the response Y.
tp <- ggplot(myundata, aes(Y))+geom_density()
tp

  2. Homoscedasticity of residuals, or equal variance. The residual-versus-fitted plots below show a roughly constant spread of residuals, supporting this assumption.

3) No autocorrelation of the residuals is apparent from the plot: beyond lag 0, the autocorrelations are close to 0.

# Method 1: Visualise with acf plot
# Sample autocorrelation of the residuals: only the lag-0 spike should
# be large if the residuals are uncorrelated.
acf(model1$residuals)

# Method 3: Durbin-Watson test
# DW near 2 (here 1.9857, p = 0.44) is consistent with no first-order
# autocorrelation in the residuals.
lmtest::dwtest(model1)
## 
##  Durbin-Watson test
## 
## data:  model1
## DW = 1.9857, p-value = 0.4432
## alternative hypothesis: true autocorrelation is greater than 0

4) The X variables and the residuals are uncorrelated.

# Correlation test between X1 and the residuals: the estimate is
# numerically zero (p = 1), so X1 is uncorrelated with the residuals.
cor.test(uniteddata$X1, model1$residuals) 
## 
##  Pearson's product-moment correlation
## 
## data:  uniteddata$X1 and model1$residuals
## t = -2.5554e-14, df = 398, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09805172  0.09805172
## sample estimates:
##           cor 
## -1.280924e-15
# The same check for X2: again numerically zero.
cor.test(uniteddata$X2, model1$residuals) 
## 
##  Pearson's product-moment correlation
## 
## data:  uniteddata$X2 and model1$residuals
## t = -1.2686e-14, df = 398, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09805172  0.09805172
## sample estimates:
##           cor 
## -6.358721e-16

5) The variability in the X values is positive. The X variables in the sample must not all be the same or nearly the same.

# Sample variances of the predictors: both strictly positive, so the
# variability assumption holds.
var(uniteddata$X1)
## [1] 215.3105
var(uniteddata$X2)
## [1] 25.49096
  6. No perfect multicollinearity: there is no perfect linear relationship between the explanatory variables. The variance inflation factor is small (<10) for both explanatory variables, signifying no serious problem with multicollinearity.
# Variance inflation factors; both ~2, well below the usual cut-off of 10.
# NOTE(review): vif() is presumably car::vif -- the library() call is not
# visible in this chunk; confirm the package is attached earlier.
vif(model1)
##       X1       X2 
## 2.023233 2.023233

The correlation between X1 and X2 is fairly high (0.7111551). This suggests some multicollinearity between the predictors, although the VIF values above indicate it is not severe.

library(corrplot)
## 
## Attaching package: 'corrplot'
## The following object is masked from 'package:pls':
## 
##     corrplot
# Correlation plot of the explanatory variables (the first two columns
# are dropped, leaving X1 and X2); cor(X1, X2) = 0.71.
corrplot(cor(uniteddata[, -c(1,2)]))

cor(uniteddata[, -c(1,2)])
##           X1        X2
## X1 1.0000000 0.7111551
## X2 0.7111551 1.0000000

Check assumptions automatically

#par(mfrow=c(2,2))  # draw 4 plots in same window
#library(gvlma)

# Refit as lm() so gvlma can run its global assumption tests (model1
# above is a glm).  FIX: `<-` for assignment.
model <- lm(Y ~ X1 + X2, data = uniteddata)

# Global validation of the linear-model assumptions.
gvlma::gvlma(model)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = uniteddata)
## 
## Coefficients:
## (Intercept)           X1           X2  
##      16.539        2.150        3.465  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma::gvlma(x = model) 
## 
##                        Value p-value                   Decision
## Global Stat        8.759e+04 0.00000 Assumptions NOT satisfied!
## Skewness           3.126e+03 0.00000 Assumptions NOT satisfied!
## Kurtosis           8.446e+04 0.00000 Assumptions NOT satisfied!
## Link Function      9.031e-01 0.34194    Assumptions acceptable.
## Heteroscedasticity 3.494e+00 0.06161    Assumptions acceptable.
#DT:datatable(influence.measures(mod))
#DT::datatable(data.table(influence.measures(mod)))
#data.table(influence.measures(mod))
# Interactive table of influence diagnostics (dfbetas, hat values,
# Cook's distance, ...).  FIX: dropped a redundant pair of parentheses.
DT::datatable(influence.measures(model)$infmat)

Model Two

Principal components regression/analysis or various forms of factor analysis can be used when the explanatory variables are correlated.

# Partial least squares regression of Y on X1 and X2 with a single
# component (pls package).  FIX: `<-` for assignment.
model2 <- pls::plsr(Y~X1+X2,data=uniteddata,ncomp=1)

summary(model2)
## Data:    X dimension: 400 2 
##  Y dimension: 400 1
## Fit method: kernelpls
## Number of components considered: 1
## TRAINING: % variance explained
##    1 comps
## X    95.07
## Y    72.09
# NOTE(review): stray top-level literal left in the original source; it
# merely echoes 3 in the report.  Kept so the knitted output matches.
3
## [1] 3
#biplot(model2)
#plot(model2, plottype = "biplot")
library(plsdepot)
# PLS regression (plsdepot) with cross-validation: predictors are
# columns 3:4 of the data, response is Y, one component.
model3 <- plsdepot::plsreg1(uniteddata[,3:4],uniteddata["Y"],comps=1, crosval=TRUE)


plot(model3)

The chi-square statistic and p-value in factanal test the hypothesis that the model fits the data perfectly. About 77.3% of the variance is explained by one factor, which is very high.

# One-factor factor analysis of Y, X1 and X2 (first column dropped).
# With three variables and one factor the model has 0 degrees of
# freedom, so the usual chi-square goodness-of-fit test cannot be run.
factanal(uniteddata[,-1],1)
## 
## Call:
## factanal(x = uniteddata[, -1], factors = 1)
## 
## Uniquenesses:
##     Y    X1    X2 
## 0.111 0.217 0.354 
## 
## Loadings:
##    Factor1
## Y  0.943  
## X1 0.885  
## X2 0.803  
## 
##                Factor1
## SS loadings      2.318
## Proportion Var   0.773
## 
## The degrees of freedom for the model is 0 and the fit was 0

Model Three

The data here was restructured into three columns which are the number of trials(1,..13), Driving school(Yes/No ) and the Frequency of the number of Trials.

# Driving-test data: trials 1..13 with pass counts for drivers who did
# not (No) and did (Yes) attend a driving school.
# FIXES: seq(1:13) was redundant (seq over a vector) -- use 1:13;
# data_frame() is deprecated -- tibble() is the drop-in replacement;
# `<-` for assignment.
uniteddata2 <- tibble(No.Trial = 1:13,
                      No = c(29,16,17,4,3,9,4,5,1,1,1,3,7),
                      Yes = c(198,107,55,38,18,22,7,9,5,3,6,6,12))
 

# Long format: one row per (trial, school) with its Frequency.
m1 <- reshape2::melt(uniteddata2, id.vars = "No.Trial",
                     variable.name = "Driving.school", value.name = "Frequency")
# And back to wide, to confirm the reshape round-trips.
m2 <- reshape2::dcast(reshape2::melt(uniteddata2, id.vars = "No.Trial"), No.Trial ~ variable)

DT::datatable(m1)

Chi-square test.

# NOTE(review): this cross-tabulation SUMS No.Trial within each
# Driving.school x Frequency cell (No.Trial acts as a weight), which is
# an unusual contingency table -- confirm it is the intended test input.
xtabs(No.Trial~Driving.school+Frequency, data=m1)
##               Frequency
## Driving.school  1  3  4  5  6  7  9 12 16 17 18 22 29 38 55 107 198
##            No  30 17 11  8  0 13  6  0  2  3  0  0  1  0  0   0   0
##            Yes  0 10  0  9 23  7  8 13  0  0  5  6  0  4  3   2   1
# Chi-square test with a simulated p-value (the table has many sparse
# cells, so the asymptotic test is unreliable).
# FIX: TRUE spelled out; T is a reassignable shortcut.
chisq.test(xtabs(No.Trial~Driving.school+Frequency, data=m1),simulate.p.value = TRUE)%>%tidy()
##   statistic      p.value parameter
## 1  107.9594 0.0004997501        NA
##                                                                             method
## 1 Pearson's Chi-squared test with simulated p-value\n\t (based on 2000 replicates)

The frequency of those passing the driving test who attended a driving school is higher than those who didn’t attend a driving school. Attending a driving school increases your probability of passing a driving test.

# Pass frequency by number of trials, one line per driving-school group.
# FIX: `<-` for assignment.
p <- ggplot(m1, aes(No.Trial,Frequency, color=Driving.school)) + 
  geom_line() + 
  geom_point()+theme_bw()+xlab("Number of Trials")
ggplotly(p)
  1. Distribution plot of the Frequency of Number of trials for both those who had training from driving school and those who didn’t. The distribution is skewed to the right. A logarithmic transformation could improve the skewness of the distribution.
# Histogram of the pass frequencies; right-skewed.
# FIXES: `<-` for assignment; the x-axis label previously said
# "Number of Trials" but the variable plotted is Frequency.
# NOTE(review): qplot() is deprecated in recent ggplot2 versions.
pq1 <- qplot( m1$Frequency, geom="histogram",binwidth = 2)+theme_minimal()+xlab("Frequency")
ggplotly(pq1)

Taking the log of the frequency of the number of trials for both those who attended a driving school and those who didn't. Even with the logarithmic transformation, the distribution does not appear to be normal.

# Histogram of log-transformed frequencies; still not convincingly normal.
# FIXES: `<-` for assignment; the axis label previously said
# "log(Number of Trials)" but the variable plotted is log(Frequency).
pq <- qplot(log( m1$Frequency), geom="histogram",binwidth = 0.5)+theme_minimal()+xlab("log(Frequency)")
ggplotly(pq)

Holding the number of trials constant, attending a driving school is associated with an increase of 1.90802228 in the mean log-frequency of passing the driving test compared with not attending.

# Gaussian GLM of log Frequency on trials, school, and their interaction.
# FIX: the old formula `No.Trial + Driving.school + No.Trial*Driving.school`
# was redundant -- `a*b` already expands to a + b + a:b, so the fitted
# model (terms, coefficients, order) is identical.  `<-` for assignment.
model4 <- glm(log(Frequency) ~ No.Trial * Driving.school, data = m1)
#summary(model4)

# Coefficients: the Driving.schoolYes term is the shift in mean
# log-frequency for school attendees at No.Trial = 0.
broom::tidy(model4)
##                         term    estimate  std.error statistic      p.value
## 1                (Intercept)  2.86791968 0.45198583  6.345154 2.194396e-06
## 2                   No.Trial -0.19326892 0.05694486 -3.393966 2.608602e-03
## 3          Driving.schoolYes  1.90802228 0.63920449  2.984995 6.827809e-03
## 4 No.Trial:Driving.schoolYes -0.08783224 0.08053220 -1.090647 2.872298e-01
head(broom::augment(model4))
##   log.Frequency. No.Trial Driving.school  .fitted   .se.fit     .resid
## 1       3.367296        1             No 2.674651 0.4026610  0.6926451
## 2       2.772589        2             No 2.481382 0.3556205  0.2912069
## 3       2.833213        3             No 2.288113 0.3118999  0.5451004
## 4       1.386294        4             No 2.094844 0.2730980 -0.7085496
## 5       1.098612        5             No 1.901575 0.2415966 -0.8029628
## 6       2.197225        6             No 1.708306 0.2205465  0.4889184
##         .hat    .sigma     .cooksd .std.resid
## 1 0.27472527 0.7660149 0.106138895  1.0586910
## 2 0.21428571 0.7830317 0.012468860  0.4276408
## 3 0.16483516 0.7754587 0.029745280  0.7764262
## 4 0.12637363 0.7687087 0.035213164 -0.9867728
## 5 0.09890110 0.7643337 0.033266457 -1.1010802
## 6 0.08241758 0.7783783 0.009912018  0.6643908
broom::glance(model4)
##   null.deviance df.null    logLik      AIC      BIC deviance df.residual
## 1      45.03364      25 -27.86532 65.73064 72.02112 12.98384          22
  1. A logistic regression of whether a person attended driving school on the number of trials and the log frequency of attempts: log(Frequency) is significant at the 0.05 level, while No.Trial is borderline (p ≈ 0.058).
# 70/30 train/test split stratified on the school indicator (caret).
# NOTE(review): there is no set.seed() before createDataPartition(), so
# the split -- and everything downstream -- is not reproducible.
index <- createDataPartition(m1$Driving.school,p=0.70, list=FALSE)

trainSet <- m1[index,]

testSet <- m1[-index,]



# Logistic regression: probability of having attended driving school as
# a function of trial number and log pass frequency.
# FIX: `<-` for assignment.
model5 <- glm(Driving.school~No.Trial+log(Frequency),data=trainSet,family="binomial")



# Profile-likelihood 95% confidence intervals for the coefficients.
knitr::kable(confint(model5))
## Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) -19.3296658 -2.653230
No.Trial 0.1018008 1.352168
log(Frequency) 0.7677598 4.940666
tidy(model5)
##             term   estimate std.error statistic    p.value
## 1    (Intercept) -8.9546485 4.0593821 -2.205914 0.02739001
## 2       No.Trial  0.5811987 0.3061354  1.898502 0.05762998
## 3 log(Frequency)  2.2936612 1.0037003  2.285205 0.02230079

For a one-unit increase in log(Frequency), the log-odds of having attended a driving school increase by about 2.29, holding the number of trials constant.

# Fitted coefficients of the logistic model, on the log-odds scale.
coef(model5)
##    (Intercept)       No.Trial log(Frequency) 
##     -8.9546485      0.5811987      2.2936612

Equivalently, for a one-unit increase in log(Frequency), the odds of having attended a driving school are multiplied by about 9.9 (roughly a 900% increase), holding the number of trials constant.

## odds ratios only)

## odds ratios and 95% CI
# Exponentiated coefficients: odds ratios with profile-likelihood CIs.
exp(cbind(OR = coef(model5), confint(model5)))
## Waiting for profiling to be done...
##                          OR        2.5 %       97.5 %
## (Intercept)    0.0001291355 4.029330e-09   0.07042341
## No.Trial       1.7881805650 1.107163e+00   3.86579749
## log(Frequency) 9.9111581761 2.154933e+00 139.86343263
# Predict using the test data
#testSet 
# Predictions on the link (log-odds) scale -- `type` was not specified.
pred<-predict(model5,testSet)


# 
# # Print, plot variable importance
 print(varImp(model5, scale = FALSE))
##                 Overall
## No.Trial       1.898502
## log(Frequency) 2.285205
# 
# plot(varImp(model5, scale = FALSE), main="Variable Importance using logistic/glm")
# 
# 
# confusionMatrix(testSet$Hypertension,pred)
# BUG FIX: data.frame(cbind(...)) built a matrix first, silently
# converting the Driving.school factor to its integer codes.  Build the
# data frame directly and make the numeric conversion explicit so the
# same 1/2 coding is obtained without relying on matrix coercion.
 my_data <- data.frame(predicted = pred,
                       observed = as.numeric(testSet$Driving.school))
 
ggplot(my_data,aes(predicted,observed))+geom_point()+geom_smooth(method=lm)+ggtitle('logistic model')

# Class predictions on the held-out set at a 0.5 probability cut-off.
# FIX: the result was previously stored in a variable named `predict`,
# shadowing stats::predict -- renamed to pred_prob.
pred_prob <- predict(model5,testSet, type = 'response')

pp <- if_else(pred_prob >0.5,"Yes","No")

# Confusion matrix: observed school attendance vs predicted class.
tab <- table(testSet$Driving.school, pp)
tab
##      pp
##       No Yes
##   No   2   1
##   Yes  1   2
# Observed labels alongside the thresholded predictions.
# FIX: avoid naming a variable `predict` (it shadows stats::predict);
# renamed to pred_class.
pred_class <- predict(model5,testSet, type = 'response')

pred_class <- if_else(pred_class >0.5,"Yes","No")
#predict
data.frame(observed=testSet$Driving.school,predicted=pred_class)
##   observed predicted
## 1       No        No
## 2       No        No
## 3       No       Yes
## 4      Yes       Yes
## 5      Yes        No
## 6      Yes       Yes
#cbind(m1$Driving.school, predict)
#==================================================================
#ROCR Curve
#==================================================================




# ROC curve (plotROC): the raw pass Frequency is used as the score for
# classifying Driving.school on the training set.
p <- ggplot(data.frame(trainSet), aes(d = as.numeric(Driving.school), m = Frequency)) + geom_roc()+ style_roc()


# NOTE(review): the warnings below arise because d is the factor coded
# 1/2 rather than 0/1; verify_d() assumes 1 = 0 and 2 = 1.
plot_interactive_roc(p)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!

## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!

Let Pn be the probability that a driver who did not attend driving school passes the test on a given trial (0.1706485), and Ps the corresponding probability for a driving-school driver (0.8293515). Ps is nearly 5 times Pn.

# Share of all passes by school attendance: No = 0.171, Yes = 0.829.
tapply(m1$Frequency,m1$Driving.school, sum)/sum(m1$Frequency)
##        No       Yes 
## 0.1706485 0.8293515